In [1]:
import feedparser
import re
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import string
from collections import Counter
# Stopword list: English stopwords plus punctuation marks
stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)
In [2]:
def getwords(html):
    '''
    Strip HTML tags, tokenize, lower-case, and drop
    stopwords and punctuation.
    '''
    txt = re.compile(r'<[^>]+>').sub('', html)
    word_list = [i.lower() for i in wordpunct_tokenize(txt)
                 if i.lower() not in stop_words]
    return word_list
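
On a small HTML snippet the tokenizer behaves like this (the input string is made up for illustration):

In [ ]:
getwords('<p>Clustering blog posts with K-Means!</p>')
# expected: ['clustering', 'blog', 'posts', 'k', 'means']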
In [3]:
def getwordcounts(url):
    '''
    Parse an RSS/Atom feed and return one document per entry,
    built from the entry title followed by its summary (or
    description, whichever the feed provides).
    '''
    d = feedparser.parse(url)
    summary = []
    for e in d.entries:
        # join title and body with a space so words don't run together
        if 'summary' in e:
            summary.append(e.title + ' ' + e.summary)
        else:
            summary.append(e.title + ' ' + e.description)
    return summary
In [4]:
summary = getwordcounts('https://sethuiyer.wordpress.com/feed/atom/')
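
A quick sanity check on how many posts came back (the count depends on the live feed, so no output is shown here):

In [ ]:
len(summary)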
In [5]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# getwords already lowercases tokens, so lowercase=True is
# redundant but harmless here
vectorizer = TfidfVectorizer(tokenizer=getwords,
                             max_df=0.5,
                             min_df=0.1,
                             lowercase=True)
tfidf_model = vectorizer.fit_transform(summary)
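
The result is a sparse document-term matrix; its shape shows how many terms survived the max_df/min_df pruning:

In [ ]:
tfidf_model.shape  # (n_documents, n_terms)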
In [6]:
import collections

# random_state pinned so the cluster assignments are reproducible
km_model = KMeans(n_clusters=3, random_state=1)
km_model.fit(tfidf_model)

# group document indices by their assigned cluster label
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
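
A quick look at how the posts are distributed across the three clusters:

In [ ]:
{label: len(docs) for label, docs in clustering.items()}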
In [7]:
from sklearn.decomposition import NMF

# Note: on scikit-learn >= 1.2 the alpha parameter is split into
# alpha_W/alpha_H, and get_feature_names() becomes get_feature_names_out()
nmf = NMF(n_components=3, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf_model)
feature_names = vectorizer.get_feature_names()

def print_top_words(model, feature_names, n_top_words=2):
    topic_list = []
    for topic in model.components_:
        # argsort is ascending, so slice the top n_top_words from the end
        topic_list.append(" ".join([feature_names[i]
                                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topic_list
In [8]:
topic_list = print_top_words(nmf, feature_names, n_top_words=2)
In [9]:
# Caveat: NMF topics and KMeans clusters are fit independently, so
# topic i and cluster i do not necessarily describe the same documents.
for i in range(3):
    print("Topic Name: ", topic_list[i])
    print("Documents in the cluster: ", clustering[i])
    print('----------------')
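
One way to pair topics with documents directly (a sketch, using only objects defined above): assign each post to its highest-weight NMF topic, then group indices the same way the KMeans clustering was grouped.

In [ ]:
doc_topic = nmf.transform(tfidf_model)  # (n_documents, n_topics) weights
nmf_clusters = collections.defaultdict(list)
for idx, topic in enumerate(doc_topic.argmax(axis=1)):
    nmf_clusters[topic].append(idx)
for i in range(3):
    print("Topic Name: ", topic_list[i])
    print("Documents with this dominant topic: ", nmf_clusters[i])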